import re
import string
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch
from collections import defaultdict
from collections import Counter
import keras
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import (LSTM,
Embedding,
BatchNormalization,
Dense,
TimeDistributed,
Dropout,
Bidirectional,
Flatten,
GlobalMaxPool1D)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam
from sklearn.metrics import (
precision_score,
recall_score,
f1_score,
classification_report,
accuracy_score
)
# Load the Eluvio dataset and take a first look: preview rows, shape,
# and per-column missing-value counts (notebook cells display the last
# expression, hence the bare `df.head()` etc.).
df = pd.read_csv('Eluvio_DS_Challenge.csv')
df.head()
df.shape
# Count of NaNs per column — used to confirm there are no missing values.
df.isnull().sum()
# There are no missing values in the dataset.
# Report the total row count and the number of distinct values per column.
columns = df.columns
print('total:', len(df))
for column in columns:
    print(column)
    # nunique(dropna=False) counts distinct values directly and is
    # equivalent to len(df[column].unique()) without materialising the array.
    print(df[column].nunique(dropna=False))
    print()
# Create new dataframe including two columns.
# .copy() makes this an independent frame, avoiding pandas'
# SettingWithCopyWarning when 'title_len' is assigned below.
df = df[['title', 'over_18']].copy()
df.head()

# insert the new column containing the number of words in title column
# (whitespace-split word count; split(' ') matches the original behaviour).
df['title_len'] = df['title'].apply(lambda x: len(x.split(' ')))
df.head()
max(df['title_len'])

# Per-class row counts, in sorted group order: index 0 = False, 1 = True.
balance_count = df.groupby('over_18')['over_18'].agg('count').values
balance_count
# Bar chart of the class balance: one trace per target value so each
# class gets its own legend entry and colour.
fig = go.Figure()
for label, count, colour in zip(['False', 'True'],
                                balance_count,
                                ['royalblue', 'seagreen']):
    fig.add_trace(go.Bar(
        x=[label],
        y=[count],
        name=label,
        text=[count],
        textposition='auto',
        marker_color=colour,
    ))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Dataset distribution by target</span>'
)
fig.show()
# cleaning the corpus
# Patterns are raw strings (the originals like '\[.*?\]' are invalid escape
# sequences — a SyntaxWarning on modern Python) and compiled once at module
# level instead of on every call, since this runs per row.
_RE_BRACKETS = re.compile(r'\[.*?\]')
_RE_URL = re.compile(r'https?://\S+|www\.\S+')
_RE_HTML = re.compile(r'<.*?>+')
_RE_PUNCT = re.compile('[%s]' % re.escape(string.punctuation))
_RE_WORD_WITH_DIGIT = re.compile(r'\w*\d\w*')


def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links,
    remove HTML tags, remove punctuation, remove newlines,
    and remove words containing numbers.'''
    text = str(text).lower()
    text = _RE_BRACKETS.sub('', text)
    text = _RE_URL.sub('', text)
    text = _RE_HTML.sub('', text)
    text = _RE_PUNCT.sub('', text)
    text = text.replace('\n', '')
    text = _RE_WORD_WITH_DIGIT.sub('', text)
    return text
# insert new column - clean text
# Cleaned copy of each title; the raw 'title' column is kept untouched.
df['title_clean'] = df['title'].apply(clean_text)
df.head()
# Stopwords are commonly used English words that carry little contextual
# meaning in a sentence, so we remove them before classification.
# remove stopwords
# A set gives O(1) membership tests; the NLTK list would cost O(n) per word
# inside the per-row apply below.
stop_words = set(stopwords.words('english'))
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words.union(more_stopwords)


def remove_stopwords(text):
    """Return *text* with English (and custom) stopwords dropped.

    Splits on single spaces to match the cleaning step above.
    """
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text


df['title_clean'] = df['title_clean'].apply(remove_stopwords)
df.head()
# Stemming usually refers to a crude heuristic process that chops off the
# ends of words — often removing derivational affixes — in the hope of
# reducing related word forms to a common base most of the time.
# Stemming
stemmer = nltk.SnowballStemmer("english")


def stemm_text(text):
    """Reduce every space-delimited word in *text* to its Snowball stem."""
    stemmed_words = [stemmer.stem(word) for word in text.split(' ')]
    return ' '.join(stemmed_words)


df['title_clean'] = df['title_clean'].apply(stemm_text)
df.head()
# insert new encoded target column which are 0 or 1
from sklearn.preprocessing import LabelEncoder

# fit_transform performs the fit and the transform in a single call.
le = LabelEncoder()
df['target'] = le.fit_transform(df['over_18'])
df.head()
def _show_wordcloud(target_value, plot_title):
    """Render a word cloud of 'title_clean' for rows whose target equals
    *target_value* (0 = not over_18, 1 = over_18)."""
    wc = WordCloud(background_color='white', max_words=200)
    wc.generate(' '.join(text for text in df.loc[df['target'] == target_value, 'title_clean']))
    plt.figure(figsize=(15, 8))
    plt.title(plot_title,
              fontdict={'size': 22, 'verticalalignment': 'bottom'})
    plt.imshow(wc)
    plt.axis("off")
    plt.show()


# The two original cells were byte-for-byte duplicates except for the
# target value and the title string; the helper removes that duplication.
_show_wordcloud(0, 'Top words for not over_18 titles')
_show_wordcloud(1, 'Top words for over_18 titles')
# We need to perform tokenization — the process of segmenting text into
# sequences of words. The benefit of tokenization is that it gets the text
# into a format that is easier to convert to raw numbers, which can then
# actually be used for processing.
# Fit a word index on the cleaned titles and pad every sequence to the
# length of the longest title.
feature = df['title_clean']
target = df['target']
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(feature)
# +1 because Keras reserves index 0 for padding.
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length
# Longest title by NLTK token count fixes the padded sequence length.
# NOTE(review): word_tokenize counts may differ slightly from the Keras
# tokenizer's splitting — padded length is an upper bound, confirm intent.
longest_train = max(feature, key=lambda sentence: len(word_tokenize(sentence)))
length_long_sentence = len(word_tokenize(longest_train))
length_long_sentence
longest_train


def embed(corpus):
    """Map each text in *corpus* to its sequence of word indices."""
    return word_tokenizer.texts_to_sequences(corpus)


# 'post' padding appends zeros after the tokens.
padded_texts = pad_sequences(embed(feature), padding = 'post', maxlen = length_long_sentence)
padded_texts
len(padded_texts)
target
# As noted before, the classes are extremely imbalanced, so it is necessary
# to perform over-sampling using SMOTE (Synthetic Minority Over-sampling
# Technique).
import imblearn
from imblearn.over_sampling import SMOTE

# Balance the classes by synthesising minority-class samples from the
# padded sequences. X/y replace padded_texts/target from here on.
oversample = SMOTE()
X, y = oversample.fit_resample(padded_texts, target)
X
len(X)
y
# Class counts after resampling — the two should now match.
print(sum(y == 0))
print(sum(y == 1))
# To obtain a vector representation for words we can use an unsupervised
# learning algorithm called GloVe (Global Vectors for Word Representation),
# which focuses on word co-occurrences over the whole corpus. Its embeddings
# relate to the probabilities that two words appear together.
#
# Word embeddings are a form of word representation that bridges the human
# understanding of language to that of a machine. They are learned
# representations of text in an n-dimensional space where words with similar
# meanings have similar representations: two similar words are represented by
# nearly identical vectors placed close together in the vector space.
#
# Thus, when using word embeddings, all individual words are represented as
# real-valued vectors in a predefined vector space. Each word is mapped to
# one vector, and the vector values are learned in a way that resembles a
# neural network.
# Build the GloVe lookup: word -> 100-d embedding vector.
embeddings_dictionary = dict()
embedding_dim = 100

# load GloVe 100D embeddings
# GloVe files are UTF-8; state that explicitly so the read does not depend
# on the platform's default encoding. Iterating the handle streams the file
# line by line instead of loading ~350 MB at once via readlines().
with open('glove.6B.100d.txt', encoding='utf-8') as fp:
    for line in fp:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Now we will load embedding vectors of those words that appear in the
# GloVe dictionary. Others keep their all-zero initialisation.
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
embedding_matrix
embedding_matrix.shape
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split — and every metric reported below —
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
def glove_lstm():
    """Build a bidirectional-LSTM binary classifier whose embedding layer
    is initialised from the pre-trained GloVe matrix (and fine-tuned
    during training). Returns the compiled Keras model."""
    model = Sequential([
        Embedding(input_dim=embedding_matrix.shape[0],
                  output_dim=embedding_matrix.shape[1],
                  weights=[embedding_matrix],
                  input_length=length_long_sentence),
        Bidirectional(LSTM(length_long_sentence,
                           return_sequences=True,
                           recurrent_dropout=0.2)),
        GlobalMaxPool1D(),
        BatchNormalization(),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return model


model = glove_lstm()
model.summary()
import tensorflow as tf
from keras.callbacks import EarlyStopping

# Use the standalone-keras EarlyStopping (matching the keras.* imports the
# model is built from): mixing tf.keras callbacks with a standalone-keras
# model can break the callback API at fit time.
early_stop = EarlyStopping(monitor='val_loss', patience=2,
                           restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=20, batch_size=256, shuffle=True,
                    validation_split=0.1, verbose=1, callbacks=[early_stop])
# Let's plot the results
def plot_learning_curves(history, arr):
    """Plot training-vs-validation curves side by side.

    history : Keras History object (read via ``history.history``).
    arr     : list of two ``[train_key, val_key]`` pairs, e.g.
              ``[['loss', 'val_loss'], ['accuracy', 'val_accuracy']]``.
    """
    fig, ax = plt.subplots(1, 2, figsize=(20, 5))
    for idx in range(2):
        train_key, val_key = arr[idx]
        ax[idx].plot(history.history[train_key])
        ax[idx].plot(history.history[val_key])
        ax[idx].legend([train_key, val_key], fontsize=18)
        # Meaningful axis labels instead of the 'A '/'B' placeholders.
        ax[idx].set_xlabel('Epochs', fontsize=16)
        ax[idx].set_ylabel(train_key, fontsize=16)
        ax[idx].set_title(train_key + ' X ' + val_key, fontsize=16)


plot_learning_curves(history, [['loss', 'val_loss'], ['accuracy', 'val_accuracy']])
train_score = model.evaluate(X_train, y_train)
test_score = model.evaluate(X_test, y_test)

# Threshold the sigmoid outputs at 0.5 in one vectorised step instead of
# a per-element Python loop.
pred = model.predict(X_test)
binary_predictions = (pred >= 0.5).astype(int).ravel()

# sklearn's metric signatures are (y_true, y_pred); passing the predictions
# first silently swaps precision and recall (accuracy is symmetric).
print('Accuracy on testing set: ', accuracy_score(y_test, binary_predictions))
print('Precision on testing set: ', precision_score(y_test, binary_predictions))
print('Recall on testing set: ', recall_score(y_test, binary_predictions))
from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred); with the arguments swapped the
# 'Predicted'/'True' axis labels below would describe the wrong axes.
matrix = confusion_matrix(y_test, binary_predictions, normalize='all')
plt.figure(figsize=(15, 9))
ax = plt.subplot()
sns.heatmap(matrix, annot=True, ax=ax)

# labels, title and ticks
ax.set_xlabel('Predicted Labels', size=20)
ax.set_ylabel('True Labels', size=20)
ax.set_title('Confusion Matrix', size=20)
ax.xaxis.set_ticklabels([0, 1], size=15)
ax.yaxis.set_ticklabels([0, 1], size=15)